Code
import polars as pl
import altair as alt
import collections

# Lift Altair's limit on the number of rows a chart dataset may contain, so
# the charts below can be built from the full DataFrame rather than a sample.
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
Code
# Columns to load, in the order they should appear in the frame.
# First the descriptive columns (everything not ending in "_100g") ...
product_columns = [
    "code",
    "product_name",
    "nutriscore_score",
    "nutriscore_grade",
    "labels_en",
    "categories_en",
    "brands_tags",
    "packaging_en",
    "origins_en",
    "generic_name",
    "manufacturing_places_tags",
    "emb_codes_tags",
    "url",
    "creator",
    "created_datetime",
    "last_modified_datetime",
    "last_modified_by",
    "abbreviated_product_name",
    "quantity",
    "first_packaging_code_geo",
    "cities_tags",
    "purchase_places",
    "stores",
    "countries_en",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "allergens_en",
    "traces_en",
    "serving_size",
    "serving_quantity",
    "no_nutrition_data",
    "additives_en",
    "nova_group",
    "pnns_groups_1",
    "pnns_groups_2",
    "food_groups_tags",
    "food_groups_en",
    "states_en",
    "brand_owner",
    "ecoscore_score",
    "ecoscore_grade",
    "nutrient_levels_tags",
    "product_quantity",
    "owner",
    "data_quality_errors_tags",
    "unique_scans_n",
    "popularity_tags",
    "completeness",
    "last_image_t",
    "last_image_datetime",
    "main_category_en",
    "image_url",
    "image_small_url",
    "image_ingredients_url",
    "image_ingredients_small_url",
    "image_nutrition_url",
    "image_nutrition_small_url",
]

# ... followed by every "_100g" column.
per_100g_columns = [
    "energy-kj_100g",
    "energy-kcal_100g",
    "energy_100g",
    "energy-from-fat_100g",
    "fat_100g",
    "saturated-fat_100g",
    "butyric-acid_100g",
    "caproic-acid_100g",
    "caprylic-acid_100g",
    "capric-acid_100g",
    "lauric-acid_100g",
    "myristic-acid_100g",
    "palmitic-acid_100g",
    "stearic-acid_100g",
    "arachidic-acid_100g",
    "behenic-acid_100g",
    "lignoceric-acid_100g",
    "cerotic-acid_100g",
    "montanic-acid_100g",
    "melissic-acid_100g",
    "unsaturated-fat_100g",
    "monounsaturated-fat_100g",
    "polyunsaturated-fat_100g",
    "omega-3-fat_100g",
    "alpha-linolenic-acid_100g",
    "eicosapentaenoic-acid_100g",
    "docosahexaenoic-acid_100g",
    "omega-6-fat_100g",
    "linoleic-acid_100g",
    "arachidonic-acid_100g",
    "gamma-linolenic-acid_100g",
    "dihomo-gamma-linolenic-acid_100g",
    "omega-9-fat_100g",
    "oleic-acid_100g",
    "elaidic-acid_100g",
    "gondoic-acid_100g",
    "mead-acid_100g",
    "erucic-acid_100g",
    "nervonic-acid_100g",
    "trans-fat_100g",
    "cholesterol_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "added-sugars_100g",
    "sucrose_100g",
    "glucose_100g",
    "fructose_100g",
    "lactose_100g",
    "maltose_100g",
    "maltodextrins_100g",
    "starch_100g",
    "polyols_100g",
    "erythritol_100g",
    "fiber_100g",
    "soluble-fiber_100g",
    "insoluble-fiber_100g",
    "proteins_100g",
    "casein_100g",
    "serum-proteins_100g",
    "nucleotides_100g",
    "salt_100g",
    "added-salt_100g",
    "sodium_100g",
    "alcohol_100g",
    "vitamin-a_100g",
    "beta-carotene_100g",
    "vitamin-d_100g",
    "vitamin-e_100g",
    "vitamin-k_100g",
    "vitamin-c_100g",
    "vitamin-b1_100g",
    "vitamin-b2_100g",
    "vitamin-pp_100g",
    "vitamin-b6_100g",
    "vitamin-b9_100g",
    "folates_100g",
    "vitamin-b12_100g",
    "biotin_100g",
    "pantothenic-acid_100g",
    "silica_100g",
    "bicarbonate_100g",
    "potassium_100g",
    "chloride_100g",
    "calcium_100g",
    "phosphorus_100g",
    "iron_100g",
    "magnesium_100g",
    "zinc_100g",
    "copper_100g",
    "manganese_100g",
    "fluoride_100g",
    "selenium_100g",
    "chromium_100g",
    "molybdenum_100g",
    "iodine_100g",
    "caffeine_100g",
    "taurine_100g",
    "ph_100g",
    "fruits-vegetables-nuts_100g",
    "fruits-vegetables-nuts-dried_100g",
    "fruits-vegetables-nuts-estimate_100g",
    "fruits-vegetables-nuts-estimate-from-ingredients_100g",
    "collagen-meat-protein-ratio_100g",
    "cocoa_100g",
    "chlorophyl_100g",
    "carbon-footprint_100g",
    "carbon-footprint-from-meat-or-fish_100g",
    "nutrition-score-fr_100g",
    "nutrition-score-uk_100g",
    "glycemic-index_100g",
    "water-hardness_100g",
    "choline_100g",
    "phylloquinone_100g",
    "beta-glucan_100g",
    "inositol_100g",
    "carnitine_100g",
    "sulphate_100g",
    "nitrate_100g",
]

# Read the parquet file and keep exactly these columns, in this order.
df = pl.read_parquet("df.parquet").select(product_columns + per_100g_columns)
Code
# Flatten the comma-separated "brands_tags" strings into one list of tags,
# skipping rows whose value is missing or empty.
allbrands = [
    tag
    for tags in df["brands_tags"].to_list()
    if tags
    for tag in tags.split(",")
]

# Occurrences per brand tag, as a plain dict.
brands = dict(collections.Counter(allbrands))

# Two-column frame (brand tag, count), most frequent brands first.
brand_table = pl.DataFrame({"brands": brands.keys(), "count": brands.values()})
df_brands = brand_table.sort("count", descending=True)
df_brands.head(10)
shape: (10, 2)
brands count
str i64
"gut-gunstig" 2943
"rewe" 1707
"edeka" 1653
"k-classic" 1489
"ja" 1303
"lidl" 1175
"alnatura" 1030
"aldi" 964
"rewe-beste-wah… 880
"muller-s-muhle… 846
Code
brand = "edeka"

# Boolean polars expression: the lower-cased "brands_tags" string contains
# `brand`. It is a substring match, so any tag that merely includes the text
# counts as a hit.
brand_match = pl.col("brands_tags").str.to_lowercase().str.contains(brand)

# Products of the selected brand, and how many fall into each Nutri-Score grade.
df_brand = df.filter(brand_match)
grade_counts = df_brand.groupby("nutriscore_grade").count()
display(grade_counts.sort("nutriscore_grade"))


# Shared colour encoding for the grade: fixed A-E domain with a hand-picked
# blue-to-red palette; the legend limits are set to 0 (symbolLimit, labelLimit).
c = "nutriscore_grade"
color = alt.Color(
    f"{c}:N",
    scale=alt.Scale(
        zero=False,
        domain=["A", "B", "C", "D", "E"],
        range=["blue", "lightblue", "gold", "orange", "red"],
    ),
    legend=alt.Legend(columns=1, symbolLimit=0, labelLimit=0),
)

# All products, with an extra boolean column (named after the brand) marking
# the rows that belong to it. The whole frame is plotted, not a sample.
scatter_data = df.select(
    "sugars_100g", "fat_100g", "nutriscore_grade", "brands_tags"
).with_columns(brand_match.alias(brand))

sugar_axis = alt.X("sugars_100g:Q", scale=alt.Scale(domain=(0, 100)))
fat_axis = alt.Y("fat_100g:Q", scale=alt.Scale(domain=(0, 100)))

# Brand rows are drawn fully opaque; every other product is almost transparent
# so it only forms a faint background cloud.
brand_opacity = alt.condition(alt.datum[brand], alt.value(1.0), alt.value(0.01))

# Sugar vs. fat scatter, one facet per Nutri-Score grade.
alt.Chart(scatter_data).mark_point(clip=True, filled=True).encode(
    x=sugar_axis,
    y=fat_axis,
    color=color,
    shape=f"{brand}:N",
    opacity=brand_opacity,
).facet(facet="nutriscore_grade:N", columns=3)
shape: (5, 2)
nutriscore_grade count
str u32
"A" 404
"B" 292
"C" 401
"D" 581
"E" 288
Code
# Nutri-Score data restricted to the selected brand. The boolean helper column
# (named after the brand) is added first and then used as the filter mask.
brand_scores = (
    df.select("nutriscore_score", "nutriscore_grade", "brands_tags")
    .with_columns(
        pl.col("brands_tags").str.to_lowercase().str.contains(brand).alias(brand)
    )
    .filter(pl.col(brand))
)

# One bar per integer score value.
score_axis = alt.X("nutriscore_score:Q", bin=alt.Bin(step=1))

# Histogram of scores, split into one column per grade and coloured by grade.
alt.Chart(brand_scores).mark_bar().encode(
    x=score_axis,
    y=alt.Y("count():Q"),
    color=color,
    column="nutriscore_grade:N",
)